Prepare DOC data

Read raw data, clean it and compute averages per pixel.

Author

Thelma Panaïotis

source("utils.R")

Read data

#|warning: false
# Read column names and types
# NB: file generated based on what we want to read in the dataset, this is much faster than reading the whole dataset and then selecting columns
columns <- read_delim(
  "data/raw/0227166/columns.txt", 
  delim = "\t", 
  escape_double = FALSE, 
  trim_ws = TRUE,
  show_col_types = FALSE
)

# Read with appropriate column names and types
suppressWarnings(df_doc_raw <- read_excel(
    "data/raw/0227166/4.4/data/0-data/All_Basins_Data_Merged_Hansell_2022.xlsx",
    skip = 2, 
    col_names = columns$name,
    col_types = columns$type
  ))

# Select useful columns
df_doc <- df_doc_raw %>%
  select(
    bottle, bottle_flag_w,
    lon = longitude, lat = latitude, date,
    press = `ctd pressure`,
    doc, doc_flag_w
  )

Clean

Remove aberrant doc observations (flag = 9, negative values…).

df_doc <- df_doc %>%
  filter(!date %in% c(-999, 0)) %>% # missing data
  filter(press != -999) %>% # missing depth
  filter(!bottle_flag_w %in% c(-999, 9)) %>% # bottle flag is ok
  filter(doc_flag_w != 9) %>% # flag is ok
  filter(doc > 0) %>% # doc is positive
  drop_na(doc) #%>% # drop missing values
 # mutate(date = ymd(date)) # format date
summary(df_doc)
    bottle          bottle_flag_w           lon               lat         
 Length:98189       Length:98189       Min.   :-179.99   Min.   :-78.643  
 Class :character   Class :character   1st Qu.:-133.84   1st Qu.:-32.499  
 Mode  :character   Mode  :character   Median : -35.85   Median : 15.383  
                                       Mean   : -28.34   Mean   :  6.568  
                                       3rd Qu.:  65.12   3rd Qu.: 40.334  
                                       Max.   : 213.60   Max.   : 89.988  
     date               press             doc          doc_flag_w       
 Length:98189       Min.   :   0.0   Min.   : 13.30   Length:98189      
 Class :character   1st Qu.:  89.9   1st Qu.: 40.33   Class :character  
 Mode  :character   Median : 399.5   Median : 45.20   Mode  :character  
                    Mean   : 998.9   Mean   : 52.00                     
                    3rd Qu.:1442.1   3rd Qu.: 56.75                     
                    Max.   :6903.9   Max.   :724.75                     
ggplot(df_doc) + geom_bar(aes(x = doc_flag_w))

There is also a little cleaning to do with dates: the TRACERS (2013) cruise reported dates as 201303X , but after inspecting boat trajectory (available at https://www.bco-dmo.org/dataset/549090) and comparing with DOC sampling location, it appears that those dates should be 2013030X.

Moreover, some dates are YYYYMM00, let’s change them to YYYYMM01.

Let’s fix this.

df_doc <- df_doc %>% 
  mutate(
    # Fix for Tracers 2013
    date = ifelse( # if only 7 characters in date, one character is missing, add a 0 before the last character
      nchar(date == 7),
      paste0(str_extract(date, "^[[:digit:]]{6}"), "0", str_extract(date, "[[:digit:]]{1}$")),
      date
      ),
    # Fix for dates ending in 00
    date = str_replace(date, "00$", "01"),
    date = ymd(date) # format date
  )

Assign observations to layers

DOC observations are separated in 3 layers:

  • surface (depth ⩽ 10)

  • epi pelagic (depth > 10 & depth ⩽ 200)

  • mesopelagic (depth > 200 & depth ⩽ 1000)

  • surface (depth > 1000)

df_doc <- df_doc %>% 
  mutate(layer = case_when(
    press <= surface_bottom ~ "surf",
    press > surface_bottom & press <= meso_top ~ "epi",   
    press > meso_top & press <= meso_bottom ~ "meso",
    press > meso_bottom ~ "bathy",
  )) %>% 
  filter(!is.na(layer)) # drop observations not assigned to any layer

Average by pixel across the layers

Annual

df_doc_ann <- df_doc %>%
  mutate(
    # floor and add 0.5 because other env data are on the centre of each pixel
    lon = roundp(lon, precision = 1, f = floor) + 0.5,
    lat = roundp(lat, precision = 1, f = floor) + 0.5
  ) %>%
  group_by(lon, lat, layer) %>%
  summarise(
    # compute mean and sd
    doc_mean = mean(doc),
    doc_sd = sd(doc),
    n_obs = n(),
    .groups = "drop"
  ) 

Let’s plot some maps.

# Number of observations
ggmap(
  df_doc_ann, 
  "n_obs", 
  type = "point", 
  palette = ggplot2::scale_colour_viridis_c(trans = "log1p", direction = -1)
) +
  ggtitle("Number of observations") +
  facet_wrap(~layer, ncol = 2)

# DOC mean
ggmap(
  df_doc_ann, 
  "doc_mean", 
  type = "point", 
  palette = ggplot2::scale_colour_viridis_c(option = "F", trans = "log1p")
) + 
  ggtitle("DOC mean") +
  facet_wrap(~layer, ncol = 2)

# DOC sd
ggmap(
  df_doc_ann, 
  "doc_sd", 
  type = "point", 
  palette = ggplot2::scale_colour_viridis_c(option = "E", trans = "log1p", na.value = NA)
) + 
  ggtitle("DOC sd") +
  facet_wrap(~layer, ncol = 2)

Store layered doc in multiple columns, in a similar fashion as for env data, and add season.

df_doc_ann <- df_doc_ann %>% 
  select(lon, lat, layer, mean = doc_mean, sd = doc_sd) %>% 
  pivot_longer(mean:sd) %>% 
  unite("layer", layer:name) %>% 
  pivot_wider(names_from = "layer", values_from = "value") %>% 
  mutate(season = 0, .after = lat)

Seasonal (surface layer only)

Seasons are defined as:

  • season 1 = DJF

  • season 2 = MAM

  • season 3 = JJA

  • season 4 = SON

df_doc_s <- df_doc %>% 
  filter(layer == "surf") %>% 
  mutate(season = quarter(date, fiscal_start = 12)) %>% 
  # generate seasonal average by pixel
  mutate(
    # floor and add 0.5 because other env data are on the centre of each pixel
    lon = roundp(lon, precision = 1, f = floor) + 0.5,
    lat = roundp(lat, precision = 1, f = floor) + 0.5
  ) %>%
  group_by(lon, lat, season) %>%
  summarise(
    # compute mean and sd
    surf_mean = mean(doc),
    surf_sd = sd(doc),
    .groups = "drop"
  ) 

# Counts per season
ggplot(df_doc_s) + geom_bar(aes(x = season)) + scale_x_continuous(n.breaks = 3, expand = c(0,0))

And plot seasonal maps.

ggmap(df_doc_s, "surf_mean", type = "point", palette = ggplot2::scale_colour_viridis_c(option = "A", trans = "log10")) + 
  facet_wrap(~season) +
  ggtitle("DOC mean by season")

Few observations during winter at high latitudes. Will see what we can do.

Save DOC data

# Join annual and seasonnal observations together
df_doc <- bind_rows(df_doc_ann, df_doc_s) %>% 
  # and reorder columns
  select(
    lon, lat, season, 
    contains("surf"), 
    contains("epi"), 
    contains("meso"), 
    contains("bathy")
  )

### Some sanity checks
## seasonal data is only in surf #ok
#foo %>% filter(season != 0) %>% select(-contains(c("epi", "meso", "bathy"))) %>% summary()
#foo %>% filter(season != 0) %>% select(contains(c("epi", "meso", "bathy"))) %>% summary()
#
## annual data is in all 4 layers
#foo %>% filter(season == 0) %>% summary()
#
## All ok!

# Save
save(df_doc, file = "data/01.doc_data.Rdata")